{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eb0bc0aa",
   "metadata": {},
   "source": [
    "## Description:\n",
    "这个笔记本， 尝试用协同过滤算法进行新闻的召回， 主要包括用户协同过滤和物品的协同过滤"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2fb2b1ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import os\n",
    "import math\n",
    "#import toad\n",
    "import pickle\n",
    "import time\n",
    "import random\n",
    "\n",
    "# swifter库，以最快的可用方式将任何函数应用到pandas的DataFrame或者Series用于加速\n",
    "# 特别适用于一行一行处理DataFrame数据的时候，可以用多个处理器并行处理\n",
    "import swifter\n",
    "\n",
    "import numpy as np \n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
    "import collections\n",
    "from utils import reduce_mem, metrics_recall\n",
    "\n",
    "from CFModel import UserCF, ItemCF\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5be20dc",
   "metadata": {},
   "source": [
    "## 导入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9bb11099",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始压缩内存...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 59.37it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "压缩前: 571.13 Mb \n",
      "压缩后: 435.87 Mb \n",
      "压缩比例: (23.68 %)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "data_path = 'data_process'\n",
    "data = pd.read_csv(os.path.join(data_path, 'train_data.csv'), index_col=0, parse_dates=['expo_time'])\n",
    "# 内存优化下\n",
    "data = reduce_mem(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b0f86057",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选择出需要用到的列\n",
    "use_cols = ['user_id', 'article_id', 'expo_time', 'net_status', 'exop_position', 'duration', 'click']\n",
    "data_new = data[use_cols]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28684553",
   "metadata": {},
   "source": [
    "## 划分训练集和测试集\n",
    "* 训练集， 每个用户的历史点击，去掉最后一次\n",
    "* 测试集， 每个用户的最后一次点击"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d24bc5fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 按照用户分组，然后把最后一个item拿出来\n",
    "click_df = data_new[data_new['click']==1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1ee4fea4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_hist_and_last_click(all_click):\n",
    "    all_click = all_click.sort_values(by=['user_id', 'expo_time'])\n",
    "    click_last_df = all_click.groupby('user_id').tail(1)\n",
    "    \n",
    "    # 如果用户只有一个点击，hist为空了，会导致训练的时候这个用户不可见，此时默认泄露一下\n",
    "    def hist_func(user_df):\n",
    "        if len(user_df) == 1:\n",
    "            return user_df\n",
    "        else:\n",
    "            return user_df[:-1]\n",
    "\n",
    "    click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n",
    "\n",
    "    return click_hist_df, click_last_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1bcb2b54",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_click_hist_df, user_click_last_df = get_hist_and_last_click(click_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e2f416f9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>article_id</th>\n",
       "      <th>expo_time</th>\n",
       "      <th>net_status</th>\n",
       "      <th>exop_position</th>\n",
       "      <th>duration</th>\n",
       "      <th>click</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>17340</td>\n",
       "      <td>464481478</td>\n",
       "      <td>2021-06-30 20:34:47</td>\n",
       "      <td>2</td>\n",
       "      <td>21</td>\n",
       "      <td>27</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>17340</td>\n",
       "      <td>465148736</td>\n",
       "      <td>2021-07-02 19:35:03</td>\n",
       "      <td>5</td>\n",
       "      <td>23</td>\n",
       "      <td>49</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>17340</td>\n",
       "      <td>464707540</td>\n",
       "      <td>2021-07-02 19:47:06</td>\n",
       "      <td>5</td>\n",
       "      <td>25</td>\n",
       "      <td>174</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17340</td>\n",
       "      <td>464993414</td>\n",
       "      <td>2021-07-02 19:47:06</td>\n",
       "      <td>5</td>\n",
       "      <td>27</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>17340</td>\n",
       "      <td>465115022</td>\n",
       "      <td>2021-07-02 20:01:34</td>\n",
       "      <td>5</td>\n",
       "      <td>41</td>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  article_id           expo_time  net_status  exop_position  \\\n",
       "0    17340   464481478 2021-06-30 20:34:47           2             21   \n",
       "1    17340   465148736 2021-07-02 19:35:03           5             23   \n",
       "2    17340   464707540 2021-07-02 19:47:06           5             25   \n",
       "3    17340   464993414 2021-07-02 19:47:06           5             27   \n",
       "4    17340   465115022 2021-07-02 20:01:34           5             41   \n",
       "\n",
       "   duration  click  \n",
       "0        27      1  \n",
       "1        49      1  \n",
       "2       174      1  \n",
       "3        11      1  \n",
       "4        14      1  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_click_hist_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "993d7975",
   "metadata": {},
   "source": [
    "## 协同过滤召回\n",
    "这里采用两种协同过滤， 基于用户的协同过滤和基于物品的协同过滤， 对于每一种协同过滤， 还可以尝试采用关联规则的方式进行优化， 主要是体现在相似分数的计算上"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6fc5fd4a",
   "metadata": {},
   "source": [
    "### 用户协同过滤"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c9a4c41b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "计算用户相似性矩阵.....\n",
      "用户协同过滤召回....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 16%|████████████▏                                                                | 3154/20000 [05:42<22:50, 12.29it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "召回数量不足, 这里随机从看过的文章里面选....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████| 20000/20000 [33:00<00:00, 10.10it/s]\n"
     ]
    }
   ],
   "source": [
    "usercf = UserCF(user_click_hist_df)\n",
    "user_recall_items_dict = usercf.usercf_recommend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "803f0a99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "计算用户相似性矩阵.....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████| 47533/47533 [06:55<00:00, 114.44it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "用户协同过滤召回....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 16%|████████████▏                                                                | 3154/20000 [13:29<48:16,  5.82it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "召回数量不足, 这里随机从看过的文章里面选....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [1:20:41<00:00,  4.13it/s]\n"
     ]
    }
   ],
   "source": [
    "usercf_corr = UserCF(user_click_hist_df, sim_corr_rule=True)\n",
    "user_recall_items_dict_corr = usercf_corr.usercf_recommend()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94ca3a51",
   "metadata": {},
   "source": [
    "### 文章协同过滤"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "0de6ffd5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "计算文章相似性矩阵.....\n",
      "文章协同过滤召回....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|█████████▎                                                                   | 2425/20000 [01:26<10:32, 27.80it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "召回数量不足, 这里随机从看过的文章里面选....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 16%|████████████▏                                                                | 3153/20000 [01:50<08:56, 31.43it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "召回数量不足, 这里随机从看过的文章里面选....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████| 20000/20000 [11:30<00:00, 28.94it/s]\n"
     ]
    }
   ],
   "source": [
    "itemcf = ItemCF(user_click_hist_df)\n",
    "user_recall_items_dict_itemcf = itemcf.itemcf_recommend()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "257962a0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "计算文章相似性矩阵.....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [02:54<00:00, 114.60it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文章协同过滤召回....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|█████████▎                                                                   | 2424/20000 [02:01<13:55, 21.05it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "召回数量不足, 这里随机从看过的文章里面选....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 16%|████████████▏                                                                | 3154/20000 [02:37<14:41, 19.11it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "召回数量不足, 这里随机从看过的文章里面选....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████| 20000/20000 [16:15<00:00, 20.51it/s]\n"
     ]
    }
   ],
   "source": [
    "itemcf_corr = ItemCF(user_click_hist_df, sim_corr_rule=True)\n",
    "user_recall_items_dict_itemcf_corr = itemcf_corr.itemcf_recommend()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdf17415",
   "metadata": {},
   "source": [
    "## 召回评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "baa6c948",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " topk:  50  :  hit_num:  2033 hit_rate:  0.10165 user_num :  20000\n",
      " topk:  100  :  hit_num:  3033 hit_rate:  0.15165 user_num :  20000\n",
      " topk:  150  :  hit_num:  3657 hit_rate:  0.18285 user_num :  20000\n",
      " topk:  200  :  hit_num:  4121 hit_rate:  0.20605 user_num :  20000\n"
     ]
    }
   ],
   "source": [
    "# 评估不加关联规则的用户协同过滤召回\n",
    "metrics_recall(user_recall_items_dict, user_click_last_df, topk=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8fc49e6c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " topk:  50  :  hit_num:  2022 hit_rate:  0.1011 user_num :  20000\n",
      " topk:  100  :  hit_num:  3016 hit_rate:  0.1508 user_num :  20000\n",
      " topk:  150  :  hit_num:  3763 hit_rate:  0.18815 user_num :  20000\n",
      " topk:  200  :  hit_num:  4339 hit_rate:  0.21695 user_num :  20000\n"
     ]
    }
   ],
   "source": [
    "# 评估加关联规则的用户协同过滤召回\n",
    "metrics_recall(user_recall_items_dict_corr, user_click_last_df, topk=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "07422b0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " topk:  50  :  hit_num:  2055 hit_rate:  0.10275 user_num :  20000\n",
      " topk:  100  :  hit_num:  2956 hit_rate:  0.1478 user_num :  20000\n",
      " topk:  150  :  hit_num:  3555 hit_rate:  0.17775 user_num :  20000\n",
      " topk:  200  :  hit_num:  4040 hit_rate:  0.202 user_num :  20000\n"
     ]
    }
   ],
   "source": [
    "# 评估不加关联规则的文章协同过滤召回\n",
    "metrics_recall(user_recall_items_dict_itemcf, user_click_last_df, topk=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "f21242d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " topk:  50  :  hit_num:  2185 hit_rate:  0.10925 user_num :  20000\n",
      " topk:  100  :  hit_num:  3142 hit_rate:  0.1571 user_num :  20000\n",
      " topk:  150  :  hit_num:  3828 hit_rate:  0.1914 user_num :  20000\n",
      " topk:  200  :  hit_num:  4347 hit_rate:  0.21735 user_num :  20000\n"
     ]
    }
   ],
   "source": [
    "# 评估加关联规则的文章协同过滤召回\n",
    "metrics_recall(user_recall_items_dict_itemcf_corr , user_click_last_df, topk=200)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
